Libraries
suppressPackageStartupMessages({
library(tidyverse)
library(tximport)
library(ensembldb)
library(EnsDb.Mmusculus.v79)
library(EnsDb.Hsapiens.v86)
library(edgeR)
library(matrixStats)
library(gridExtra)
library(zeallot)
library(cowplot)
library(rhdf5)
library(limma)
library(gt)
library(plotly)
library(DT)
library(IsoformSwitchAnalyzeR)
library(gplots)
library(RColorBrewer)
library(rmarkdown)
library(ggplot2)
library(renv)
library(gprofiler2)
library(msigdbr)
library(clusterProfiler)
library(enrichplot)
library(grid)
library(fgsea)
library(ggpubr)
library(cowplot)
library(knitr)
}) Functions and Output Paths
get_abundance_paths <- function(abundance_root_dir, sample_labels) {
abundance_dirs <- list.dirs(abundance_root_dir)[-1]
abundance_paths <- c()
for (abundance_dir in abundance_dirs)
# retrieve only the abundance paths corresponding to the sample labels
if (basename(abundance_dir) %in% sample_labels) {
abundance_path <- paste(normalizePath(abundance_dir),
'/abundance.tsv',
sep = '')
abundance_paths <- c(abundance_paths, abundance_path)
}
stopifnot(all(file.exists(abundance_paths)))
return(abundance_paths)
}
get_transcript_to_gene_df <- function(annotation_db){
# query the ensembl db to get gene transcripts id: gene name map
tx <- transcripts(annotation_db, columns=c("tx_id", "gene_name"))
# convert to tibble and clean up
tx <- as_tibble(tx)
tx <- dplyr::rename(tx, target_id = tx_id)
tx <- dplyr::select(tx, "target_id", "gene_name")
tx <- tx[tx$gene_name != '',]
return(tx)
}
add_row_descriptive_stats <- function(df, col_subset) {
df$sd <- rowSds(as.matrix(abund[, col_subset]))
df$mean <- rowMeans(as.matrix(abund[, col_subset]))
df$median <- rowMedians(as.matrix(abund[, col_subset]))
return(df)
}
convert_tx_gene_mtx_to_tibble <- function(mtx, sample_labels) {
# add column names that are the names of the samples
stopif(is.null(sample_labels))
colnames(mtx) <- sample_labels
df <- as_tibble(mtx, rownames = 'gene')
return(df)
}
build_digital_gene_expression_list <- function(gene_counts, sample_labels) {
# @return A list containing a matrix of counts (same as what was passed in)
# and a dataframe of samples containing the total count and a
# placeholder normalization factor of 1
# subset to sample labels cols only, and convert to matrix
dge_list <- DGEList(as.matrix(gene_counts[, sample_labels]))
row.names(dge_list$counts) <- gene_counts$gene
return(dge_list)
}
build_log_cpm_df <- function(dge_list, control_label, long) {
# Construct a logged counts per million df
# uses normalized library sizes by default, as calculated earlier
log_cpm <- cpm(dge_list, log=TRUE)
log_cpm <- as_tibble(log_cpm, rownames = "gene_id")
log_cpm$gene_id <- gsub('"', "", log_cpm$gene_id)
# move control cols to the front so that later plots are
# more straightforward
log_cpm <- select(log_cpm, starts_with(control_label), everything())
log_cpm <- select(log_cpm, 'gene_id', everything())
if (long == TRUE) {
sample_cols <- colnames(log_cpm)[-1]
log_cpm <- pivot_longer(log_cpm,
cols = all_of(sample_cols),
names_to = "sample_label",
values_to = "expression")
}
return(log_cpm)
}
build_log_cpm_plot <- function(cpm_df, subtitle) {
stopifnot('sample_label' %in% colnames(cpm_df))
plt <- ggplot(cpm_df) +
aes(x=sample_label, y=expression, fill=sample_label) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median",
geom = "point",
shape = 95,
size = 10,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title="Log2 Counts per Million (CPM)",
subtitle=subtitle,
caption=paste0("produced on ", Sys.time())) +
theme_bw()
return(plt)
}
filter_dge_list <- function(dge_list, min_cpm, min_samples_with_min_cpm) {
# get a counts per million matrix where each record is a gene and each col
# is a sample
cpm_mtx <- cpm(dge_list)
# filter dge_list
msk <- rowSums(cpm_mtx >= min_cpm) >= min_samples_with_min_cpm
dge_list <- dge_list[msk, ]
return(dge_list)
}
get_gene_counts <- function(abundance_paths, sample_labels, tx_to_gene_df) {
# params:
# abundance_paths: c vector of paths of abundance files
# tx_to_gene_df: df mapping a transcript identifier to a gene name
# returns:
# gene_counts: count of genes aggregated from txs- not exactly equal to
# a million- possibly bc some txs don't have a gene in the reference?
# gene_lengths
# gene_abunds
# get list of matrixes summarizing information across the data in abundance_paths
gene_stats_list <- tximport(file = abundance_paths,
type = "kallisto",
tx2gene = tx_to_gene_df,
txOut = FALSE, # false means gene level, not tx level
countsFromAbundance = "lengthScaledTPM",
ignoreTxVersion = TRUE,
abundanceCol = 'tpm')
# add column headers and convert to tibble
gene_counts <- convert_tx_gene_mtx_to_tibble(gene_stats_list$counts,
sample_labels)
# gene_lengths <- convert_tx_gene_mtx_to_tibble(gene_stats_list$length,
# sample_labels)
# gene_abunds <- convert_tx_gene_mtx_to_tibble(gene_stats_list$abundance,
# sample_labels)
return(gene_counts)
}
assign_abundance_paths_to_study_design <- function(study_design,
abundance_paths) {
abundance_df <- data.frame(abundance_paths)
abundance_df <- dplyr::rename(abundance_df, abundance_path = abundance_paths)
abundance_df$sample_label <-
str_replace(abundance_df$abundance_path, abundance_root_dir, '')
abundance_df$sample_label <-
str_replace(abundance_df$sample_label, '/abundance.tsv', '')
study_design <-
merge(study_design, abundance_df, by = 'sample_label', all = TRUE)
assert_col_not_null(study_design$abundance_path)
return(study_design)
}
assert_col_unique <- function(col) {
stopifnot(length(unique(col)) == length(col))
}
assert_col_not_null <- function(col) {
stopif(any(is.na(col)))
}
get_study_design_df <- function(study_design_path) {
study_design <- read_csv(study_design_path, col_types = cols(.default = 'c'))
assert_col_unique(study_design$sample_label)
return(study_design)
}
stopif <- function(logic) {
stopifnot(!logic)
}
convert_tibble_to_mtx <- function(tbl, row_names_col) {
mtx <- as.matrix(tbl)
# set rownames
row.names(mtx) = mtx[, row_names_col]
# remove row name col
mtx <- mtx[, ! colnames(mtx) %in% c(row_names_col)]
return(mtx)
}
plot_sample_cluster_dendogram <- function(tbl, sample_labels, sample_cluster_out_path,
write_output) {
## writes a dendogram cluster plot for a gene level dataset where
## the headers are sample_labels
# build matrix since distance function requires is
mtx <- convert_tibble_to_mtx(tbl, "gene_id")
stopifnot(setequal(colnames(mtx), sample_labels))
# calc distance between samples
distance <- dist(t(mtx), method = "maximum")
# agglomeration
clusters <- hclust(distance, method = "average")
# build dendogram and write
if (write_output) {
pdf(sample_cluster_out_path)
plot(clusters, labels=sample_labels)
dev.off()
} else {
plot(clusters, labels=sample_labels)
}
}
plot_pca_scatter <- function(pca_metrics, sample_dimensions,
study_design, pca_out_path, write_output) {
sample_labels <- study_design$sample_label
plot_list = list()
for(sample_dimension in sample_dimensions) {
# build factor for coloring
sample_dimension_factor <- factor(study_design[, sample_dimension])
# length 1 means factor failed, e.g. doesn't work on tibbles
stopif(length(sample_dimension_factor) == 1)
# use just odd elements of range so that PCs aren't repeated across plots
for(i in seq(1, 4, 2)) {
first_pc_label = paste('PC', i, sep='')
second_pc_label = paste('PC', i + 1, sep='')
plt <- ggplot(pca_metrics$scores) +
aes(x=get(first_pc_label), y=get(second_pc_label),
label=sample_labels, color=sample_dimension_factor) +
geom_point(size=4) +
# geom_label(nudge_y = 10) +
xlab(paste0(first_pc_label, "(", pca_metrics$var_explained[i]*100,"%",")")) +
ylab(paste0(second_pc_label, "(", pca_metrics$var_explained[i + 1]*100,"%",")")) +
labs(title="Sample PCA") +
coord_fixed() +
# overwrites legend title to be the actual dimension label
guides(color=guide_legend(sample_dimension)) +
theme_bw()
# print plt so that recordPlot can capture it
print(plt)
plot_list[[paste(i, sample_dimension, sep='')]] <- recordPlot()
}}
if (write_output==TRUE) {
pdf(pca_out_path, onefile=TRUE, width=4, height=4)
for (plt in plot_list) {
replayPlot(plt)
}
graphics.off()
}
}
get_pca_metrics <- function(cpm_matrix) {
## Performs Principal Component Analysis on genetic counts per million
## sample data
## Returns:
## variance explained: numeric series like object with pct variance described
## by each PC
## scores: tibble, shows how much each sample influenced each PC
## loadings: show much each variable (gene) influenced each PC
# [-1] removes gene_id column
pca <- prcomp(t(cpm_matrix[-1]), scale.=F, retx=T)
pca_sum <- summary(pca)
var_explained <- as.data.frame(t(pca_sum$importance))$'Proportion of Variance'
scores <- as_tibble(pca$x)
loadings <- pca$rotation
pca_metrics <- list(var_explained = var_explained,
loadings = loadings,
scores = scores
)
return(pca_metrics)
}
plot_pca_small_multiples <- function(pca_metrics,
sample_dimensions,
study_design,
pca_small_multiples_out_path,
write_output) {
sample_labels <- study_design$sample_label
scores <- pca_metrics$scores[, 1:6]
plot_list = list()
for (sample_dimension in sample_dimensions) {
sample_dim_vals <- study_design[, sample_dimension]
# prep and pivot long
pca_pivot <- add_column(scores, sample = sample_labels,
group = sample_dim_vals)
pca_pivot <- pivot_longer(
pca_pivot,
cols = PC1:PC6,
names_to = "PC",
values_to = "scores"
)
# build small multiples plot
plt <- ggplot(pca_pivot) +
aes(x = sample,
y = scores,
fill = group) +
geom_bar(stat = "identity") +
facet_wrap( ~ PC) +
labs(title = "PCA small multiples plot",
caption = paste0("produced on ", Sys.time())) +
theme_bw() +
coord_flip() +
guides(color = guide_legend(sample_dimension))
# print plt so that recordPlot can capture it
print(plt)
plot_list[[sample_dimension]] <- recordPlot()
}
if (write_output==TRUE) {
pdf(pca_small_multiples_out_path, onefile = TRUE)
for (plt in plot_list) {
replayPlot(plt)
}
graphics.off()
}
}
describe <- function(df_col) {
min_val <- min(df_col, na.rm = T)
max_val <- max(df_col, na.rm = T)
mean_val <- mean(df_col, na.rm = T)
q5 <- quantile(df_col, 0.05, na.rm = T)
q10 <- quantile(df_col, 0.10, na.rm = T)
q25 <- quantile(df_col, 0.25, na.rm = T)
q50 <- quantile(df_col, 0.50, na.rm = T)
q75 <- quantile(df_col, 0.75, na.rm = T)
q90 <- quantile(df_col, 0.90, na.rm = T)
q95 <- quantile(df_col, 0.95, na.rm = T)
out <- data.frame(min=min_val, max=max_val, mean=mean_val,
q5=q5, q10=q10, q25=q25, q50=q50, q75=q75,
q90=q90, q95=q95, row.names = NULL)
out <- t(out)
return(out)
}
plot_impact_of_filtering_and_normalizing <-
function(
dge_list,
dge_list_filt,
dge_list_filt_norm,
control_label,
filtering_and_normalizing_impact_out_path,
write_output
) {
log_cpm_long <-
build_log_cpm_df(dge_list, control_label, long = TRUE)
log_cpm_filt_long <-
build_log_cpm_df(dge_list_filt, control_label, long = TRUE)
log_cpm_filt_norm_long <-
build_log_cpm_df(dge_list_filt_norm, control_label, long = TRUE)
plt_1 <- build_log_cpm_plot(log_cpm_long, "unfiltered, non-normalized")
plt_2 <- build_log_cpm_plot(log_cpm_filt_long, "filtered, non-normalized")
plt_3 <- build_log_cpm_plot(log_cpm_filt_norm_long, "filtered, normalized")
all_plts <- plot_grid(plt_1, plt_2, plt_3,
labels = c('A', 'B', 'C'),
label_size = 12)
if (write_output){
ggsave(file=log_cpm_filter_norm_out_path, all_plts)
} else {
print(all_plts)
}
}
get_design_matrix <- function(study_design, has_intercept,
explanatory_variable) {
variable_factor <- factor(study_design[, explanatory_variable])
if (has_intercept == TRUE) {
design_matrix <- model.matrix(variable_factor)
} else {
design_matrix <- model.matrix(~0 + variable_factor)
}
# interaction
# design_matrix <- model.matrix(~age_factor*population_factor)
# additive
# design_matrix <- model.matrix(~age_factor + population_factor)
# better column names
prefix <- paste(explanatory_variable, '_', sep='')
colnames(design_matrix) <- sub("variable_factor", prefix, colnames(design_matrix))
return(design_matrix)
}
get_topTable_dge <- function(bayes_stats,
multiple_testing_correction_method,
min_lfc,
max_p_val) {
# adjust p values and get dataframe of genes sorted by abs log fold change,
sig_dge <- topTable(
bayes_stats,
adjust = multiple_testing_correction_method,
coef = 1,
number = 999999,
lfc = min_lfc
)
sig_dge <- as_tibble(sig_dge, rownames='gene_id')
sig_dge <- dplyr::filter(sig_dge, adj.P.Val <= max_p_val)
return(sig_dge)
}
plot_dge_volcano <- function(dge,
title,
dge_volcano_out_path,
write_output) {
vplot <- ggplot(dge) +
aes(y = -log10(adj.P.Val),
x = logFC,
text = paste("Symbol:", gene_id)) +
geom_point(size = 2) +
labs(title = title,
# subtitle = "Insert Subtitle",
caption = paste0("produced on ", Sys.time())) +
theme_bw()
# write interactive plot
vplot <- plotly::ggplotly(vplot)
if (write_output) {
htmlwidgets::saveWidget(vplot, dge_volcano_out_path)
} else {
vplot
}
}
build_datatable <- function(data, caption_text) {
dtbl <- datatable(
data,
extensions = c('KeyTable', "FixedHeader"),
caption = caption_text,
rownames = FALSE,
selection = 'multiple',
filter = 'top',
options = list(
keys = TRUE,
searchHighlight = TRUE,
pageLength = 10,
orderMulti = TRUE,
# scrollX = '400px',
scrollX = TRUE,
lengthMenu = c("10", "25", "50", "100")
))
return(dtbl)
}
plot_dge_datatable <- function(all_dge,
sig_dge_datatable_out_path,
write_output) {
all_dge <- dplyr::select(all_dge, -any_of(c('t', 'P.Value', 'B', 'AveExpr')))
# add flag for if logfc is statistically significant
all_dge$significant <- with(all_dge, ifelse(adj.P.Val <= .05, 'True', 'False'))
all_dge <- relocate(all_dge, significant, .after = adj.P.Val)
dtable <- build_datatable(all_dge,
'Gene Expression (counts are log2cpm)')
round_cols <-
names(dtable$x$data)[!names(dtable$x$data) %in% c('gene_id', 'significant')]
dtable <- formatRound(dtable, columns = round_cols, digits = 2)
if (write_output) {
htmlwidgets::saveWidget(dtable, sig_dge_datatable_out_path)
} else {
dtable
}
}
get_mean_variance_weights <- function(dge_list_filt_norm,
design_matrix) {
# voom requires the input to be counts, not CPM, TPM etc
# performs log2cpm of the counts then variance stabilizes them,
# then estimates the mean-variance relationship and produces observation
# level weights for linear modelling
# object is an Expression List (EList)
mean_variance_weights <- voom(dge_list_filt_norm, design_matrix,
# plot=TRUE,
save.plot = TRUE)
return(mean_variance_weights)
}
plot_mean_variance_distribution <- function(mean_variance_weights,
mean_variance_plot_out_path,
write_output){
# points
df = data.frame(x = mean_variance_weights$voom.xy$x,
y = mean_variance_weights$voom.xy$y)
# trend line
df.line = data.frame(x = mean_variance_weights$voom.line$x,
y = mean_variance_weights$voom.line$y)
plt <-
ggplot(df, aes(x,y)) +
geom_point(size=.1) +
theme_bw(15) +
geom_line(data=df.line, aes(x,y), color="red") +
ylim(0, max(df$y))
interactive_plot <- plotly::ggplotly(plt)
# add titles/labels here due to bug in ggplotly
interactive_plot <- plotly::layout(interactive_plot,
title='Gene Mean vs Variance Across Samples',
xaxis=list(title='log2 count + 0.5'),
yaxis=list(title="Sqrt(standard deviation)"),
margin=list(l=50, r=50, b=100, t=100))
if (write_output) {
htmlwidgets::saveWidget(interactive_plot, mean_variance_plot_out_path)
} else {
interactive_plot
}
}
get_empirical_bayes_differential_expression_stats <-
function(mean_variance_weights, explanatory_variable,
experimental_label, control_label) {
# builds a linear model with coefficients for each column in design matrix
# each row is a gene, each value is the average of the transformed
# counts from voom for that category/gene combo, if explanatory variables
# are factors
linear_stats <- lmFit(mean_variance_weights, design_matrix)
# makes matrix representing the contrasts that will be evaluated
experimental_column <- paste0(explanatory_variable, '_', experimental_label)
control_column <- paste0(explanatory_variable, '_', control_label)
contrast_string <- paste0(experimental_column, '-', control_column)
contrast_matrix <- makeContrasts(contrasts=contrast_string,
levels=design_matrix)
# coefficients here are the differences between the coefficients of each
# category in the contrast
contrast_stats <- contrasts.fit(linear_stats, contrast_matrix)
# uses empirical bayes method to produce p values and other statistics
# showing whether each gene has a log fold change greater than 0
bayes_stats <- eBayes(contrast_stats)
return(bayes_stats)
}
get_sig_dif_expressed_genes <- function(bayes_stats,
multiple_testing_correction_method,
significance_function,
min_lfc,
max_p_val
) {
if (significance_function == 'decideTests') {
stop('decideTests deg creation not fully implemented yet')
#TODO dge output needs logFC attached to it, which the decideTests output
# does not have
# using bayes_stats- the fitted model object (that includes the
# contrast matrix)- get a gene-level table showing whether the gene was
# significantly negative, sig positive, or not sig with respect to each
# coefficient in the contrast matrix
dge <- decideTests(
bayes_stats,
method = "global",
adjust.method = multiple_testing_correction_method,
p.value = max_p_val,
lfc = min_lfc
)
dge <- data.frame(dge)
dge <- as_tibble(dge, rownames = 'gene_id')
dge$gene_id <- gsub('"', "", dge$gene_id)
} else if (significance_function == 'topTable') {
dge <- get_topTable_dge(bayes_stats,
multiple_testing_correction_method,
min_lfc,
max_p_val)
}
else {
stop('Invalid significance function')
}
return(dge)
}
get_clusters <- function(cluster_type, sig_dge_mtx) {
# TODO consider using the clust command line tool instead
# - different clustering algorithms produce very different clusters
# - clust uses an ensembl method to get the consensus clustering across
# multiple clustering algorithms
# https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1536-8
stopifnot(cluster_type %in% c('gene', 'sample'))
if (cluster_type == 'gene') {
# use pearson bc gene expression is continuous
cor_mtx <- cor(t(sig_dge_mtx), method='pearson')
#TODO comment on why as.dist needs to be done twice
# (1-cor is to make the vals be 0 to 2)
cor_dist <- as.dist(as.dist(1-cor_mtx))
} else if (cluster_type == 'sample') {
# spearman bc samples discrete and should be ranked
cor_mtx <- cor(sig_dge_mtx, method='spearman')
cor_dist <- (as.dist(1-cor_mtx))
}
clust <-hclust(cor_dist, method='complete')
}
prep_dge_subset_for_heatmap <- function(dge_subset, log_cpm_filt_norm) {
# subset to just sample logfc cols to do cluster correlations
dge_subset <- dge_subset[, colnames(log_cpm_filt_norm)]
row.names(dge_subset) <- dge_subset$gene_id
dge_subset <- subset(dge_subset, select=-c(gene_id))
dge_subset <- as.matrix(dge_subset)
return(dge_subset)
}
get_dge_subset_by_gene_set_for_heatmap <- function(dge, gene_set,
log_cpm_filt_norm) {
dge_subset <- dplyr::filter(dge, gene_id %in% gene_set$gene_symbol)
dge_subset <- prep_dge_subset_for_heatmap(dge_subset, log_cpm_filt_norm)
return(dge_subset)
}
get_most_expressed_dge_subset_for_heatmap <- function(dge, log_cpm_filt_norm) {
# subset to genes with highest/lowest lfc if necessary
if (nrow(dge) > 20) {
dge_high_lfc <- head(dge[order(-dge$logFC),], 10)
dge_low_lfc <- head(dge[order(dge$logFC),], 10)
dge_subset <- rbind(dge_high_lfc, dge_low_lfc)
} else {
dge_subset <- dge
}
dge_subset <- prep_dge_subset_for_heatmap(dge_subset, log_cpm_filt_norm)
return(dge_subset)
}
get_gene_cluster_colors <- function(gene_clust) {
# group the clusters, k is the number of sample categories
gene_clust_groups <- cutree(gene_clust, k=2)
# convert the cluster groups to colors
gene_clust_colors <- rainbow(length(unique(gene_clust_groups)),
start=0.1, end=0.9)
gene_clust_colors <- gene_clust_colors[as.vector(gene_clust_groups)]
return(gene_clust_colors)
}
plot_gene_cluster_heatmap <- function(dge, log_cpm_filt_norm, gene_set=NULL) {
if (is.null(gene_set)) {
dge_subset <-
get_most_expressed_dge_subset_for_heatmap(dge, log_cpm_filt_norm)
plot_title <- paste('Gene Cluster Heatmap for up to',
'10 highest/lowest LFC genes',
sep= "\n")
} else {
gene_set_name <-unique(gene_set$gs_name)
stopifnot(length(gene_set_name) == 1)
dge_subset <-
get_dge_subset_by_gene_set_for_heatmap(dge, gene_set, log_cpm_filt_norm)
plot_title <- paste('Gene Cluster Heatmap for',
gene_set_name, sep= "\n")
}
gene_clust <- get_clusters('gene', dge_subset)
sample_clust <- get_clusters('sample', dge_subset)
gene_clust_colors <- get_gene_cluster_colors(gene_clust)
heat_colors <- rev(brewer.pal(name="RdBu", n=11))
# dge static heatmap , scale='row' computes z score that
# scales the expression of the rows (genes) to better highlight
# between gene differences
gene_scaling_heatmap <-
heatmap.2(dge_subset,
Rowv = as.dendrogram(gene_clust),
dendrogram='row',
# Colv = as.dendrogram(sample_clust),
Colv = FALSE,
RowSideColors = gene_clust_colors,
col = heat_colors, scale = 'row',
srtCol = 45,
density.info = 'none', trace = 'none',
cexRow = 1, cexCol = 1,
# margins = c(4, 4),
main=plot_title)
# same as above except no row-wise scaling, making the
# between sample differences more obvious
# sample_scaling_heatmap <-
# heatmap.2(sig_dge_subset,
# Rowv = as.dendrogram(gene_clust),
# Colv = as.dendrogram(sample_clust),
# RowSideColors = gene_clust_colors,
# col = heat_colors, scale = 'none',
# srtCol = 45,
# density.info = 'none', trace = 'none',
# cexRow = 1, cexCol = 1, margins = c(8, 8),
# main=paste('Gene Cluster Heatmap (sample scaling)',
# 'for up to 10 highest/lowest LFC genes',
# sep= "\n"))
# if (write_output) {
# png(gene_cluster_heatmap_sample_scaling_out_path)
# dev.off()}
}
get_dge_list_filt_norm <- function(gene_counts, sample_labels, min_cpm,
min_samples_with_min_cpm) {
dge_list <-
build_digital_gene_expression_list(gene_counts, sample_labels)
dge_list_filt <- filter_dge_list(dge_list,
min_cpm = min_cpm,
min_samples_with_min_cpm = min_samples_with_min_cpm)
# adjusts norm factors in dge list samples df, allows comparison across samples
dge_list_filt_norm <-
calcNormFactors(dge_list_filt, method = 'TMM')
return(list(dge_list, dge_list_filt, dge_list_filt_norm))
}
get_msig_gene_sets <- function(species) {
msig_hallmarks <- get_msig_hallmark_labels_of_interest()
# load the gene ontology sets, and filter to the ones that are a part of
# the hallmarks we care about
msig_gene_sets <- msigdbr(species=species, category='H') # H for hallmarks
msig_gene_sets <- dplyr::filter(msig_gene_sets, gs_name %in% msig_hallmarks)
msig_gene_sets <- dplyr::select(msig_gene_sets, gs_name, gene_symbol)
return(msig_gene_sets)
}
plot_gost_gene_set_enrichment <- function(sig_dge, organism, out_path,
write_output, title) {
# Builds and writes an interactive plot showing functional
# enrichment by various categories. A gene set is considered functionally
# enriched if it statstically (p-value) significantly enriched compared to
# all other genes. Uses Gene Ontology mappings (http://geneontology.org/)
# sig_dge: dataframe of differentially expressed genes
unique_genes <- unique(sig_dge$gene_id)
# TODO determine what correction method is typically used and use that
# default args return significantly enriched gene sets at a 0.05 threshold
gost_res <- gost(unique_genes,
organism = organism,
correction_method = "fdr",
sources= c('GO:BP', 'GO:MF', 'GO:CC'))
out_plot <- gostplot(gost_res, interactive = T, capped = T)
out_plot <- layout(out_plot, title=title)
if (write_output) {
htmlwidgets::saveWidget(out_plot, out_path)
} else {
out_plot
}
}
get_msig_hallmark_labels_of_interest <- function() {
msig_hallmarks <- c(
'HALLMARK_TNFA_SIGNALING_VIA_NFKB',
'HALLMARK_ALLOGRAFT_REJECTION',
'HALLMARK_COAGULATION',
'HALLMARK_INFLAMMATORY_RESPONSE',
'HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',
'HALLMARK_P53_PATHWAY',
'HALLMARK_IL6_JAK_STAT3_SIGNALING',
'HALLMARK_KRAS_SIGNALING_UP',
'HALLMARK_INTERFERON_GAMMA_RESPONSE',
'HALLMARK_APOPTOSIS',
'HALLMARK_COMPLEMENT',
'HALLMARK_KRAS_SIGNALING_DN',
'HALLMARK_ANGIOGENESIS',
'HALLMARK_XENOBIOTIC_METABOLISM',
'HALLMARK_APICAL_SURFACE',
'HALLMARK_APICAL_JUNCTION',
'HALLMARK_HYPOXIA',
'HALLMARK_TGF_BETA_SIGNALING',
'HALLMARK_UV_RESPONSE_UP',
'HALLMARK_MYOGENESIS',
'HALLMARK_E2F_TARGETS',
'HALLMARK_G2M_CHECKPOINT',
'HALLMARK_MITOTIC_SPINDLE',
'HALLMARK_MYC_TARGETS_V1',
'HALLMARK_SPERMATOGENESIS',
'HALLMARK_ANDROGEN_RESPONSE',
'HALLMARK_MTORC1_SIGNALING',
'HALLMARK_INFLAMMATORY_RESPONSE',
'HALLMARK_P53_PATHWAY'
)
return(msig_hallmarks)
}
get_go_gene_sets <- function(ont) {
# ont is GO category (ontology)
gene_set_list <- getGO(org='mouse', ont=ont)
# merge human readable gene set name onto gene set df
gene_sets <- merge(gene_set_list$geneset, gene_set_list$geneset_name,
by=1, all.x = TRUE)
# cleanup
gene_sets <- dplyr::rename(gene_sets, gs_name=Term, gene_symbol=gene)
gene_sets <- dplyr::select(gene_sets, 'gs_name', 'gene_symbol')
gene_sets <- as_tibble(gene_sets)
gene_sets <- drop_na(gene_sets)
return(gene_sets)
}
get_custom_gene_sets <- function(custom_gene_sets_path) {
gene_sets_custom <- read_csv(custom_gene_sets_path, show_col_types = FALSE)
return(gene_sets_custom)
}
get_gene_sets <- function(custom_gene_sets_path) {
# get msig gene sets
gene_sets <- get_msig_gene_sets('Mus musculus')
# get go gene sets, concat to msig gene sets
go_bp_gene_sets <- msigdbr(species = "Mus musculus", category = "C5",
subcategory = "BP")
go_bp_gene_sets <- dplyr::select(go_bp_gene_sets, "gs_name", "gene_symbol")
gene_sets <- rbind(gene_sets, go_bp_gene_sets)
# if there are custom gene sets, attach them too
if (!is.null(custom_gene_sets_path)) {
gene_sets_custom <- get_custom_gene_sets(custom_gene_sets_path)
gene_sets <- rbind(gene_sets, gene_sets_custom)
}
return(gene_sets)
}
filter_gsea_df_to_most_sig_pos_and_neg_enriched_pathways <- function(gsea_df) {
gsea_df_down <- dplyr::filter(gsea_df, NES<0)
gsea_df_down <- head(gsea_df_down[order(gsea_df_down$p.adjust),], 10)
gsea_df_up <- dplyr::filter(gsea_df, NES>=0)
gsea_df_up <- head(gsea_df_up[order(gsea_df_up$p.adjust),], 10)
gsea_df_filtered <- rbind(gsea_df_down, gsea_df_up)
return(gsea_df_filtered)
}
get_fgsea_gene_set_input <- function(gene_sets) {
# convert to named list
gene_sets_fgsea <- split(gene_sets$gene_symbol, gene_sets$gs_name)
return(gene_sets_fgsea)
}
get_fgsea_input <- function(all_dge) {
fgsea_input <- dplyr::select(all_dge, 'gene_id', 'logFC')
# Todo convert this to an assertion that it's unique by gene and
# add .0001 to logFC that are the same to distinguish them
fgsea_input <- distinct(fgsea_input, gene_id, .keep_all=TRUE) %>%
distinct(logFC, .keep_all=TRUE)
# order descending by the ranking metric although it doesn't seem
# to make a big difference for fgsea
fgsea_input <- fgsea_input[order(fgsea_input$logFC),]
# convert to named list
fgsea_input <- deframe(fgsea_input)
return(fgsea_input)
}
get_fgsea_response_df <- function(gene_sets_fgsea, fgsea_input) {
fgsea_res <- fgsea(pathways=gene_sets_fgsea, stats=fgsea_input)
fgsea_df <- as_tibble(fgsea_res)
return(fgsea_df)
}
plot_fgsea <- function(gene_sets_fgsea_filtered, fgsea_df, fgsea_input,
gsea_plot_grid_title_text) {
plot_list = list()
for (gs_name in names(gene_sets_fgsea_filtered)) {
adj_p_value = fgsea_df[fgsea_df$pathway == gs_name,]$padj
adj_p_value = format(round(adj_p_value, digits=4), nsmall=4)
nes <- fgsea_df[fgsea_df$pathway == gs_name,]$NES
nes <- format(round(nes, digits=3), nsmall=3)
subtitle_str <- paste0('(FDR PVal: ', adj_p_value, ', NES: ', nes, ')')
title_str <- paste(gs_name, subtitle_str, sep='\n')
plt <- plotEnrichment(pathway=gene_sets_fgsea_filtered[[gs_name]],
stats=fgsea_input) +
labs(title=title_str) +
theme(plot.title = element_text(hjust = 0.5, size=12),
# plot.margin = margin(0, 0, 0, 0)
) +
xlab('Rank by LogFC')
plot_list[[gs_name]] <- plt
}
ncol <- min(length(plot_list), 2)
nrow <- floor(length(plot_list)/2) + 1
# heights <- rep(unit(7, 'cm'), nrow)
out <- plot_grid(plotlist=plot_list, nrow=nrow, ncol=ncol)
title <- ggdraw() + draw_label(gsea_plot_grid_title_text, fontface='bold')
print(plot_grid(title, out, ncol=1, rel_heights=c(0.1, 1)))
}
build_and_plot_fgsea <- function(gene_sets, all_dge, grid_title_text) {
gene_sets_fgsea <- get_fgsea_gene_set_input(gene_sets)
fgsea_input <- get_fgsea_input(all_dge)
fgsea_df <- get_fgsea_response_df(gene_sets_fgsea, fgsea_input)
fgsea_df <- dplyr::select(fgsea_df, -any_of(c('pval', 'log2err')))
dtbl <- build_datatable(fgsea_df, paste0(grid_title_text, ' Table'))
dtbl <- formatRound(dtbl, columns = c('padj', 'ES', 'NES'), digits = 4)
if (nrow(fgsea_df) > 0) {
fgsea_df_down <- dplyr::filter(fgsea_df, NES<0, padj<.05)
fgsea_df_down <- head(fgsea_df_down[order(fgsea_df_down$padj),], 5)
fgsea_df_up <- dplyr::filter(fgsea_df, NES>=0, padj<.05)
fgsea_df_up <- head(fgsea_df_up[order(fgsea_df_up$padj),], 5)
fgsea_df_filtered <- rbind(fgsea_df_down, fgsea_df_up)
gene_sets_fgsea_filtered <- gene_sets_fgsea[fgsea_df_filtered$pathway]
gsea_plot_grid_title_text <- paste0(grid_title_text, ' (Most Significant High/Low NES)')
plot_fgsea(gene_sets_fgsea_filtered, fgsea_df, fgsea_input, gsea_plot_grid_title_text)
}
dtbl
}
simple_datatable <- function(df, round_cols = NULL) {
dtable <- datatable(df, options = list(dom='t'), rownames=FALSE)
if (!is.null(round_cols)) {
dtable <- formatRound(dtable, round_cols, digits = 2)
}
dtable
}
# import configuration information and write to disk for recordkeeping
source('./bulk_rna_seq_analysis/config.R')
capture.output(cfgs[[run]], file=paste0(output_dir, "config.csv"))
# Output Paths (only used if not knitting to rmarkdown)
filtering_and_normalizing_impact_out_path <-
paste0(output_dir, "filter_norm_impact.pdf")
pca_scatter_out_path <-
paste0(output_dir, "pca_scatter.pdf")
pca_small_multiples_out_path <-
paste0(output_dir, "pca_small_multiples.pdf")
pca_scatter_ext_out_path <-
paste0(output_dir, "pca_scatter_ext.pdf")
pca_small_multiples_ext_out_path <-
paste0(output_dir, "pca_small_multiples_ext.pdf")
sample_cluster_out_path <-
paste0(output_dir, "sample_cluster.pdf")
dge_volcano_out_path <-
paste0(output_dir, "dge_volcano.html")
dge_volcano_sig_out_path <-
paste0(output_dir, "dge_volcano_sig.html")
dge_csv_out_path <-
paste0(output_dir, "dge_table.csv")
all_dge_csv_out_path <-
paste0(output_dir, "all_dge_table.csv")
sig_dge_datatable_out_path <-
paste0(output_dir, "dge_table.html")
isoform_analysis_out_dir <-
paste0(output_dir, "isoform_analysis/")
mean_variance_plot_out_path <-
paste0(output_dir, "mean_variance_trend.png")
gene_cluster_heatmap_gene_scaling_out_path <-
paste0(output_dir, "gene_cluster_heatmap_gene_scaling.png")
gene_cluster_heatmap_sample_scaling_out_path <-
paste0(output_dir, "gene_cluster_heatmap_sample_scaling.png")
gost_plot_up_path <-
paste0(output_dir, 'gene_ontology_plot_up.html')
gost_plot_down_path <-
paste0(output_dir, 'gene_ontology_plot_down.html')
gsea_table_path <-
paste0(output_dir, 'gsea_table.xlsx')
gsea_line_plot_path <-
paste0(output_dir, 'gsea_line_plot.pdf')
gsea_bubble_plot_path <-
paste0(output_dir, 'gsea_bubble_plot.pdf')Overview
This report contains a differential gene expression analysis of bulk RNA-seq data, as well as the R code used to perform that analysis. Upstream downloading, aligning, and QC of the bulk RNA-seq data was performed using kallisto and other command line tools. The assembly and execution of those commands was handled by Python scripts. All of this code is available at https://github.com/awmundy/bio .
This codebase is generalized to act as an analysis pipeline for a wide range of bulk RNA-Seq experiments. Small changes to configuration files and the construction of a simple study design file are all that is needed to produce a similar analysis on different data.
Replication Description
cat(output_description)This report contains a replication of the differential gene expression analysis comparing young vs old mouse liver cells in the following Duan et al paper: https://www.nature.com/articles/s43587-022-00348-z
Data Flow
The first section of the data flow is handled by Python scripts which store the run configuration details and construct/execute the cli commands.
- The process begins with downloading the reference transcriptome
(fasta) and sequence data (fastq) from the experiment
- Kallisto is then used to psuedoallign the fastq files and produce
transcript level abundances
- Various QC tools are then used to produce QC reports for review
The remainder of the data flow is handled by R scripts.
- Each major step is described in this report next to the relevant R
functions
- More detailed information on the mechanics and motivation behind
each of these steps is provided at https://github.com/awmundy/bio/blob/main/documentation/data_transforms.md
- The final step is the generation of this html report using knitr
# error=FALSE is required but there is not actually an error
include_graphics(paste0(getwd(), "/documentation/pipeline_flowchart.png"),
dpi = 110, error = FALSE)Data Preparation
- The study design file is read in
- A design matrix is constructed according to the specifications in a
configuration file
- The study design and design matrix contain information about our samples, our variable of interest, and our model specification
Study Design
study_design <- get_study_design_df(study_design_path)
simple_datatable(study_design)sample_labels <- study_design$sample_label
abundance_paths <- get_abundance_paths(abundance_root_dir, sample_labels)
study_design <- assign_abundance_paths_to_study_design(study_design, abundance_paths)
design_matrix <- get_design_matrix(study_design, FALSE, explanatory_variable)Design Matrix
simple_datatable(design_matrix)abundance_paths <- study_design$abundance_pathAggregating Counts to Gene Level
- A transcript to gene mapping is downloaded from Ensembl
- Transcript level abundances are then read in and aggregated to gene level using this mapping
A few records of these tables are printed below
# Read abundances and build digital gene expression lists
tx_to_gene_df <- get_transcript_to_gene_df(EnsDb.Mmusculus.v79)Transcript to Gene Mapping
simple_datatable(head(tx_to_gene_df))gene_counts <- get_gene_counts(abundance_paths, sample_labels, tx_to_gene_df)Gene Counts
simple_datatable(head(gene_counts), sample_labels)Filtering, Normalizing, Converting to LogCPM
Various functions in the edgeR library are then used to further prepare the counts.
- The counts are stored in a digital gene expression list object
- These counts are then filtered according to the minimum counts per
million thresholds specified in the configuration file
- Normalization factors are produced for each sample using Trimmed
Mean of M values (TMM) that account for differences in library sizes and
for “crowding out” by highly expressed genes
- The counts are then converted to log2 counts per million, allowing counts from different samples to be compared
# Normalization, Filtering, Logging, Converting to Counts per Million
c(dge_list, dge_list_filt, dge_list_filt_norm) %<-%
get_dge_list_filt_norm(gene_counts, sample_labels, min_cpm,
min_samples_with_min_cpm)Filtered, Log2CPM counts
log_cpm_filt_norm <- build_log_cpm_df(dge_list_filt_norm, control_label,
long = FALSE)
simple_datatable(head(log_cpm_filt_norm), sample_labels)Differential Gene Expression
The limma library is then used to identify differential gene expression.
- Weights are produced for each sample/gene combination
- These weights are used to handle the heteroskedasticity present in
gene log counts (the variance is typically lower for higher log count
genes)
- We don’t want heteroskedasticity in our data because we want to be
able to identify differential expression equally well for both high and
low count genes.
- These weights are used to handle the heteroskedasticity present in
gene log counts (the variance is typically lower for higher log count
genes)
- A linear model is then fit at the gene level to the weighted counts
- The model variables are defined in the design matrix
- The coefficients for this model are differenced according to the
design matrix
- i.e. if looking for the change in gene expression of the
experimental group with respect to the control group, the control
coefficient would be subtracted from the experimental coefficient
- The model variables are defined in the design matrix
- An empirical bayes calculation for each gene is then performed,
testing for whether there was a significant logfold change in
expression
- P values from the previous step are corrected for multiple testing and the significantly differentially expressed genes are identified
# Build Mean/Variance Weights Across Samples for Each Gene
mean_variance_weights <- get_mean_variance_weights(dge_list_filt_norm,
design_matrix)
# Build Differential Gene Expression Dataframe
bayes_stats <-
get_empirical_bayes_differential_expression_stats(mean_variance_weights,
explanatory_variable,
experimental_label,
control_label)
sig_dge <- get_sig_dif_expressed_genes(bayes_stats,
multiple_testing_correction_method,
'topTable',
min_lfc = 0,
max_p_val = 0.05)
all_dge <- get_sig_dif_expressed_genes(bayes_stats,
multiple_testing_correction_method,
'topTable',
min_lfc = 0,
max_p_val = 1.0)Differential Gene Expression Volcano Plots
plot_dge_volcano(sig_dge,
'Significantly Differentially Expressed Genes',
dge_volcano_sig_out_path,
write_output)Differential Gene Expression Table
# merge gene level df with sample cols with gene level df with significance info
sig_dge <- merge(sig_dge, log_cpm_filt_norm, by='gene_id', all.x = TRUE)
all_dge <- merge(all_dge, log_cpm_filt_norm, by='gene_id', all.x = TRUE)
# throw error if any row of sig_dge failed to find a merge
stopif(sum(is.na(sig_dge[colnames(log_cpm_filt_norm[2])])) > 0)
write_csv(sig_dge, file=dge_csv_out_path)
write_csv(all_dge, file=all_dge_csv_out_path)
plot_dge_datatable(all_dge, sig_dge_datatable_out_path, write_output)Gene Set Enrichment Analysis
Next, Gene Set Enrichment Analysis (GSEA) is performed. This identifies biologically relevant sets of genes that are differentally expressed. Even when few or none of the genes in a set are individually significantly differentially expressed, the set as a whole can nevertheless be found to be if the genes generally exhibit differential expression in the same direction.
- First, custom gene sets that have been hand curated are
analyzed
- Then, gene sets identified by MSIG have the same analysis
performed
- To reduce clutter, only the most significantly up and downregulated
gene sets are plotted, although all are included in the table
- Heatmaps are then produced providing more detail about the
differential expression characteristics of each gene in each gene
set
- Gene sets defined by the GO Consortium are then plotted according to their significance and their GO defined aspect
GSEA for Custom Gene Sets
gene_sets_custom <- get_custom_gene_sets(custom_gene_sets_path)
build_and_plot_fgsea(gene_sets_custom, all_dge, 'GSEA for Custom Gene Sets')GSEA for MSIG Gene Sets
gene_sets_msig <- get_msig_gene_sets('Mus musculus')
build_and_plot_fgsea(gene_sets_msig, all_dge, 'GSEA for MSIGDB Gene Sets')Gene Cluster Differential Expression Heatmap for Custom Gene Sets
gene_sets_custom <- get_custom_gene_sets(custom_gene_sets_path)
for (gene_set_label in unique(gene_sets_custom$gs_name)) {
gene_set <-
dplyr::filter(gene_sets_custom, gene_sets_custom$gs_name == gene_set_label)
plot_gene_cluster_heatmap(all_dge, log_cpm_filt_norm, gene_set)
}Gene Cluster Differential Expression Heatmap for Highest/Lowest LFC Genes
plot_gene_cluster_heatmap(sig_dge, log_cpm_filt_norm)Gene Ontology GOST plots
# split out into upregulated and downregulated sets
sig_dge_up <- dplyr::filter(sig_dge, logFC >= 0)
sig_dge_down <- dplyr::filter(sig_dge, logFC < 0)
plot_gost_gene_set_enrichment(sig_dge_up, 'mmusculus',
gost_plot_up_path, write_output,
'Significantly Upregulated Pathways')Sys.sleep(5)
plot_gost_gene_set_enrichment(sig_dge_down, 'mmusculus',
gost_plot_down_path, write_output,
'Significantly Downregulated Pathways')Quality Control
To assist with QC, a few other plots are generated
- Principal Component Analysis and cluster dendograms can be useful
for identifying if the differences between the cohorts in the experiment
are high
- The filtering/normalization violin plots help identify imbalanced
samples, as well as identify the impact of filtering and
normalization
- The shape of the mean/variance trend line can be informative for determining if more low count genes should be filtered out
Principal Component Analysis
pca_metrics <- get_pca_metrics(log_cpm_filt_norm)
plot_pca_scatter(pca_metrics, sample_dimensions, study_design,
pca_scatter_out_path, write_output)plot_pca_small_multiples(pca_metrics, sample_dimensions, study_design,
pca_small_multiples_out_path, write_output)Sample Cluster Dendogram
plot_sample_cluster_dendogram(log_cpm_filt_norm, sample_labels,
sample_cluster_out_path, write_output)# sleep to stop knitr bug where plots repeat
Sys.sleep(5)Filtering and Normalization Impact
plot_impact_of_filtering_and_normalizing(dge_list, dge_list_filt,
dge_list_filt_norm,
control_label,
filtering_and_normalizing_impact_out_path,
write_output)Mean/Variance Distribution and Trend Line
plot_mean_variance_distribution(mean_variance_weights,
mean_variance_plot_out_path,
write_output)